{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from model.db import DB_ENGINE\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import logging\n",
    "import jieba\n",
    "import jieba.analyse\n",
    "from math import sqrt\n",
    "import os\n",
    "from pprint import pprint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models.doc2vec import Doc2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restore the saved Doc2Vec model from its on-disk path.\n",
    "model = Doc2Vec.load(\n",
    "    'RuntimeTY/d2v_2048_5_1216'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure root logging: INFO level and above, each record prefixed with a timestamp.\n",
    "logging.basicConfig(\n",
    "    format='%(asctime)s %(message)s',\n",
    "    level=logging.INFO,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rid</th>\n",
       "      <th>content</th>\n",
       "      <th>tag</th>\n",
       "      <th>assure</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>破5000是大概率事件</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>出天涯钻，5毛一个</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   rid                                content  tag  assure\n",
       "0    1        比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。  1.0       1\n",
       "1    2                            破5000是大概率事件  1.0       1\n",
       "2    3  估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。  0.0       1\n",
       "3    4                              出天涯钻，5毛一个  1.0       1\n",
       "4    5                                         1.0       1"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the rid/content/tag/assure columns of the rawcontents table into a DataFrame.\n",
    "raw_contents = pd.read_sql(\n",
    "    sql='SELECT rid, content, tag, assure FROM rawcontents',\n",
    "    con=DB_ENGINE,\n",
    ")\n",
    "raw_contents.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                rid            tag    assure\n",
      "count  1.007420e+05  100742.000000  100742.0\n",
      "mean   5.841642e+05       0.981507       1.0\n",
      "std    4.699411e+05       0.134726       0.0\n",
      "min    1.000000e+00       0.000000       1.0\n",
      "25%    1.784752e+05       1.000000       1.0\n",
      "50%    4.258105e+05       1.000000       1.0\n",
      "75%    1.008566e+06       1.000000       1.0\n",
      "max    1.587615e+06       1.000000       1.0\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rid</th>\n",
       "      <th>content</th>\n",
       "      <th>tag</th>\n",
       "      <th>assure</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>破5000是大概率事件</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>出天涯钻，5毛一个</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   rid                                content  tag  assure\n",
       "0    1        比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。  1.0       1\n",
       "1    2                            破5000是大概率事件  1.0       1\n",
       "2    3  估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。  0.0       1\n",
       "3    4                              出天涯钻，5毛一个  1.0       1\n",
       "4    5                                         1.0       1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the rows whose `assure` value exceeds 0.5, as an independent copy\n",
    "# so later column assignments do not touch raw_contents.\n",
    "tagged_data = raw_contents.loc[raw_contents['assure'] > 0.5].copy()\n",
    "# Renumber the surviving rows 0..n-1 so positional and label indexing agree.\n",
    "tagged_data.index = np.arange(len(tagged_data))\n",
    "print(tagged_data.describe())\n",
    "tagged_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.semi_supervised import LabelSpreading\n",
    "from sklearn.metrics import classification_report, confusion_matrix\n",
    "from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                rid            tag    assure             ss\n",
      "count  1.007420e+05  100742.000000  100742.0  100742.000000\n",
      "mean   5.841642e+05       0.981507       1.0      -0.799329\n",
      "std    4.699411e+05       0.134726       0.0       0.599391\n",
      "min    1.000000e+00       0.000000       1.0      -1.000000\n",
      "25%    1.784752e+05       1.000000       1.0      -1.000000\n",
      "50%    4.258105e+05       1.000000       1.0      -1.000000\n",
      "75%    1.008566e+06       1.000000       1.0      -1.000000\n",
      "max    1.587615e+06       1.000000       1.0       1.000000\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rid</th>\n",
       "      <th>content</th>\n",
       "      <th>tag</th>\n",
       "      <th>assure</th>\n",
       "      <th>ss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>破5000是大概率事件</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>出天涯钻，5毛一个</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>探讨挖矿相关的技术，探讨区块链数字货币投资。展示行业最新动态，交流各自心得。共同提高对区块链...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>有想法吗</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>看名来</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>挖掘商业新形态，掌握赚钱新模式\\n \\n 重塑商业新格局，构建商业新思维\\n \\n 抢占区块...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>币圈经过多年发展已经被头部交易所垄断，新币若想上去，需要花很多钱。\\n \\n 既然区块链是传...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>不太现实，因为好多币种都是圈钱的，如果平台都给免费上去没有任何的检测什么的，那平台肯定做不长...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>依楼主的想法，股票也可以到淘宝开店卖？</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>@yangzhenjun 2018-07-16 10:02:08\\n \\n 依楼主的想法，股...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>币圈从去年的牛市变成今天的熊市，都说币圈一天人间一年，但半年过去了，币圈还是没有转牛的迹象。...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>牛市要来了</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>在熊市的时候，其实是自己做项目做有机会的时候，因为这个时候就是一些已经老牌的项目也会受到波及...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>当全世界的目光被世界杯吸引之时，“币圈”里一则重磅消息正在击打着很多人的心理防线。6月28日...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>快蚁全球首款智能炒币机器人\\n \\n\\n AntBeast科技公司(香港)成立于2016年...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>赶快解放双手！快蚁区块链智能交易\\n \\n 人们常说，金融交易中无论是股票、外汇还是加密货币...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>快蚁区块链智能交易系统到底有多神奇，让无数炒币人疯狂追捧\\n \\n\\n 有很多人认为在数字货...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>包头火电矿场招矿机托管\\n \\n\\n 量大有优惠\\n \\n\\n 不收矿机上机费及管理费\\n ...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>这个项目不像空气币或者传销币那样子，本人亲自玩儿了两个月的项目，大概这一周他们还会推新的项目...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>zm328507854</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>区块链安全性的工作方式如下：平台上的每个交互都被记录下来，并且事务中的每条记录都与前一条记录...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>人民日报为促进区块链产业健康发展发声</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>共识机制是维护系统的运作顺序和公平性的机制，决定这区块链这个系统里的区块被准确添加到链中，确...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>区块链是21世纪最前沿的现象级概念，（加 jingshui902，一起加群来交流吧）概述区块...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>币圈众筹是由ICO发展而来的，2017年9月4日起，国内禁止ICO行为，在各个网站和主流渠道...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100712</th>\n",
       "      <td>1586409</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100713</th>\n",
       "      <td>1586416</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100714</th>\n",
       "      <td>1586506</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100715</th>\n",
       "      <td>1586522</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100716</th>\n",
       "      <td>1586536</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100717</th>\n",
       "      <td>1586564</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100718</th>\n",
       "      <td>1586600</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100719</th>\n",
       "      <td>1586611</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100720</th>\n",
       "      <td>1586618</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100721</th>\n",
       "      <td>1586647</td>\n",
       "      <td>赞过。。。。</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100722</th>\n",
       "      <td>1586671</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100723</th>\n",
       "      <td>1586680</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100724</th>\n",
       "      <td>1586686</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100725</th>\n",
       "      <td>1586708</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100726</th>\n",
       "      <td>1586815</td>\n",
       "      <td>已赞请回</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100727</th>\n",
       "      <td>1586847</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100728</th>\n",
       "      <td>1586919</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100729</th>\n",
       "      <td>1586940</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100730</th>\n",
       "      <td>1586980</td>\n",
       "      <td>赞</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100731</th>\n",
       "      <td>1587182</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100732</th>\n",
       "      <td>1587227</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100733</th>\n",
       "      <td>1587278</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100734</th>\n",
       "      <td>1587282</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100735</th>\n",
       "      <td>1587288</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100736</th>\n",
       "      <td>1587295</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100737</th>\n",
       "      <td>1587349</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100738</th>\n",
       "      <td>1587372</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100739</th>\n",
       "      <td>1587450</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100740</th>\n",
       "      <td>1587477</td>\n",
       "      <td>赞</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100741</th>\n",
       "      <td>1587615</td>\n",
       "      <td>点赞领红包</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100742 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            rid                                            content  tag  \\\n",
       "0             1                    比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。  1.0   \n",
       "1             2                                        破5000是大概率事件  1.0   \n",
       "2             3              估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。  0.0   \n",
       "3             4                                          出天涯钻，5毛一个  1.0   \n",
       "4             5                                                     1.0   \n",
       "5             6  探讨挖矿相关的技术，探讨区块链数字货币投资。展示行业最新动态，交流各自心得。共同提高对区块链...  0.0   \n",
       "6             7                                               有想法吗  1.0   \n",
       "7             8                                                看名来  1.0   \n",
       "8             9  挖掘商业新形态，掌握赚钱新模式\\n \\n 重塑商业新格局，构建商业新思维\\n \\n 抢占区块...  1.0   \n",
       "9            10  币圈经过多年发展已经被头部交易所垄断，新币若想上去，需要花很多钱。\\n \\n 既然区块链是传...  0.0   \n",
       "10           11  不太现实，因为好多币种都是圈钱的，如果平台都给免费上去没有任何的检测什么的，那平台肯定做不长...  0.0   \n",
       "11           12                                依楼主的想法，股票也可以到淘宝开店卖？  1.0   \n",
       "12           13  @yangzhenjun 2018-07-16 10:02:08\\n \\n 依楼主的想法，股...  0.0   \n",
       "13           14  币圈从去年的牛市变成今天的熊市，都说币圈一天人间一年，但半年过去了，币圈还是没有转牛的迹象。...  0.0   \n",
       "14           15                                              牛市要来了  1.0   \n",
       "15           16  在熊市的时候，其实是自己做项目做有机会的时候，因为这个时候就是一些已经老牌的项目也会受到波及...  0.0   \n",
       "16           17  当全世界的目光被世界杯吸引之时，“币圈”里一则重磅消息正在击打着很多人的心理防线。6月28日...  0.0   \n",
       "17           18  快蚁全球首款智能炒币机器人\\n \\n\\n AntBeast科技公司(香港)成立于2016年...  0.0   \n",
       "18           19  赶快解放双手！快蚁区块链智能交易\\n \\n 人们常说，金融交易中无论是股票、外汇还是加密货币...  1.0   \n",
       "19           20  快蚁区块链智能交易系统到底有多神奇，让无数炒币人疯狂追捧\\n \\n\\n 有很多人认为在数字货...  1.0   \n",
       "20           21  包头火电矿场招矿机托管\\n \\n\\n 量大有优惠\\n \\n\\n 不收矿机上机费及管理费\\n ...  1.0   \n",
       "21           22  这个项目不像空气币或者传销币那样子，本人亲自玩儿了两个月的项目，大概这一周他们还会推新的项目...  1.0   \n",
       "22           23                                        zm328507854  1.0   \n",
       "23           24  区块链安全性的工作方式如下：平台上的每个交互都被记录下来，并且事务中的每条记录都与前一条记录...  0.0   \n",
       "24           25                                                     1.0   \n",
       "25           26                                 人民日报为促进区块链产业健康发展发声  0.0   \n",
       "26           27  共识机制是维护系统的运作顺序和公平性的机制，决定这区块链这个系统里的区块被准确添加到链中，确...  0.0   \n",
       "27           28                                                     1.0   \n",
       "28           29  区块链是21世纪最前沿的现象级概念，（加 jingshui902，一起加群来交流吧）概述区块...  0.0   \n",
       "29           30  币圈众筹是由ICO发展而来的，2017年9月4日起，国内禁止ICO行为，在各个网站和主流渠道...  0.0   \n",
       "...         ...                                                ...  ...   \n",
       "100712  1586409                                                     1.0   \n",
       "100713  1586416                                                     1.0   \n",
       "100714  1586506                                                     1.0   \n",
       "100715  1586522                                                     1.0   \n",
       "100716  1586536                                                     1.0   \n",
       "100717  1586564                                                     1.0   \n",
       "100718  1586600                                                     1.0   \n",
       "100719  1586611                                                     1.0   \n",
       "100720  1586618                                                     1.0   \n",
       "100721  1586647                                             赞过。。。。  1.0   \n",
       "100722  1586671                                                     1.0   \n",
       "100723  1586680                                                     1.0   \n",
       "100724  1586686                                                     1.0   \n",
       "100725  1586708                                                     1.0   \n",
       "100726  1586815                                               已赞请回  1.0   \n",
       "100727  1586847                                                     1.0   \n",
       "100728  1586919                                                     1.0   \n",
       "100729  1586940                                                     1.0   \n",
       "100730  1586980                                                  赞  1.0   \n",
       "100731  1587182                                                     1.0   \n",
       "100732  1587227                                                     1.0   \n",
       "100733  1587278                                                     1.0   \n",
       "100734  1587282                                                     1.0   \n",
       "100735  1587288                                                     1.0   \n",
       "100736  1587295                                                     1.0   \n",
       "100737  1587349                                                     1.0   \n",
       "100738  1587372                                                     1.0   \n",
       "100739  1587450                                                     1.0   \n",
       "100740  1587477                                                  赞  1.0   \n",
       "100741  1587615                                              点赞领红包  1.0   \n",
       "\n",
       "        assure   ss  \n",
       "0            1 -1.0  \n",
       "1            1 -1.0  \n",
       "2            1 -1.0  \n",
       "3            1 -1.0  \n",
       "4            1 -1.0  \n",
       "5            1 -1.0  \n",
       "6            1 -1.0  \n",
       "7            1  1.0  \n",
       "8            1 -1.0  \n",
       "9            1 -1.0  \n",
       "10           1 -1.0  \n",
       "11           1 -1.0  \n",
       "12           1 -1.0  \n",
       "13           1 -1.0  \n",
       "14           1  1.0  \n",
       "15           1 -1.0  \n",
       "16           1 -1.0  \n",
       "17           1 -1.0  \n",
       "18           1 -1.0  \n",
       "19           1 -1.0  \n",
       "20           1  1.0  \n",
       "21           1  1.0  \n",
       "22           1 -1.0  \n",
       "23           1 -1.0  \n",
       "24           1 -1.0  \n",
       "25           1 -1.0  \n",
       "26           1 -1.0  \n",
       "27           1 -1.0  \n",
       "28           1 -1.0  \n",
       "29           1 -1.0  \n",
       "...        ...  ...  \n",
       "100712       1 -1.0  \n",
       "100713       1 -1.0  \n",
       "100714       1 -1.0  \n",
       "100715       1 -1.0  \n",
       "100716       1 -1.0  \n",
       "100717       1 -1.0  \n",
       "100718       1 -1.0  \n",
       "100719       1 -1.0  \n",
       "100720       1 -1.0  \n",
       "100721       1 -1.0  \n",
       "100722       1 -1.0  \n",
       "100723       1 -1.0  \n",
       "100724       1 -1.0  \n",
       "100725       1 -1.0  \n",
       "100726       1 -1.0  \n",
       "100727       1 -1.0  \n",
       "100728       1 -1.0  \n",
       "100729       1 -1.0  \n",
       "100730       1 -1.0  \n",
       "100731       1 -1.0  \n",
       "100732       1 -1.0  \n",
       "100733       1  1.0  \n",
       "100734       1 -1.0  \n",
       "100735       1 -1.0  \n",
       "100736       1 -1.0  \n",
       "100737       1 -1.0  \n",
       "100738       1 -1.0  \n",
       "100739       1  1.0  \n",
       "100740       1 -1.0  \n",
       "100741       1 -1.0  \n",
       "\n",
       "[100742 rows x 5 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the semi-supervised target column `ss`: a random ~`scale` fraction of rows keeps\n",
    "# its true tag, every other row is masked with -1 (the marker scikit-learn's\n",
    "# semi-supervised estimators use for unlabeled samples).\n",
    "scale = 0.1  # fraction of rows that stay labeled\n",
    "rng = np.random.RandomState(42)  # fixed seed so the labeled/unlabeled split is reproducible\n",
    "mask = rng.random_sample(len(tagged_data))\n",
    "tagged_data['ss'] = np.where(mask > scale, -1, tagged_data['tag'].values)\n",
    "# Labeled rows carry the original tag (0.0 or 1.0 in this data), so select them with >= 0;\n",
    "# a strict > 0 would silently drop every labeled row whose tag is 0.\n",
    "train_data = tagged_data[tagged_data['ss'] >= 0]\n",
    "test_data = tagged_data[tagged_data['ss'] < 0]\n",
    "\n",
    "print(tagged_data.describe())\n",
    "tagged_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One inferred Doc2Vec vector per row, in the same row order as the labels in ss_y.\n",
    "# NOTE(review): raw content strings are passed to infer_vector, which gensim documents as\n",
    "# taking a list of tokens -- confirm this matches how the model was trained.\n",
    "ss_X = [model.infer_vector(text) for text in tagged_data['content'].values]\n",
    "# Partially hidden labels: -1 marks a row whose tag is withheld from the model.\n",
    "ss_y = tagged_data['ss'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "ename": "MemoryError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mMemoryError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-10-a8b2be5ed7e0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mlp_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mLabelSpreading\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmax_iter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtol\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1e-3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mlp_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mss_X\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mss_y\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sklearn/semi_supervised/label_propagation.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m    231\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    232\u001b[0m         \u001b[0;31m# actual graph construction (implementations should override this)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 233\u001b[0;31m         \u001b[0mgraph_matrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    234\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    235\u001b[0m         \u001b[0;31m# label construction\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sklearn/semi_supervised/label_propagation.py\u001b[0m in \u001b[0;36m_build_graph\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    524\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnn_fit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    525\u001b[0m         \u001b[0mn_samples\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 526\u001b[0;31m         \u001b[0maffinity_matrix\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mX_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    527\u001b[0m         \u001b[0mlaplacian\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcsgraph\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlaplacian\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maffinity_matrix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnormed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    528\u001b[0m         \u001b[0mlaplacian\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mlaplacian\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sklearn/semi_supervised/label_propagation.py\u001b[0m in \u001b[0;36m_get_kernel\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m    128\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkernel\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"rbf\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    129\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0my\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 130\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mrbf_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgamma\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgamma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    131\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    132\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mrbf_kernel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgamma\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgamma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mrbf_kernel\u001b[0;34m(X, Y, gamma)\u001b[0m\n\u001b[1;32m    819\u001b[0m         \u001b[0mgamma\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1.0\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    820\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 821\u001b[0;31m     \u001b[0mK\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0meuclidean_distances\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msquared\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    822\u001b[0m     \u001b[0mK\u001b[0m \u001b[0;34m*=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mgamma\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    823\u001b[0m     \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mK\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mK\u001b[0m\u001b[0;34m)\u001b[0m    \u001b[0;31m# exponentiate K in-place\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36meuclidean_distances\u001b[0;34m(X, Y, Y_norm_squared, squared, X_norm_squared)\u001b[0m\n\u001b[1;32m    245\u001b[0m         \u001b[0mYY\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrow_norms\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msquared\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    246\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 247\u001b[0;31m     \u001b[0mdistances\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msafe_sparse_dot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdense_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    248\u001b[0m     \u001b[0mdistances\u001b[0m \u001b[0;34m*=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    249\u001b[0m     \u001b[0mdistances\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mXX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sklearn/utils/extmath.py\u001b[0m in \u001b[0;36msafe_sparse_dot\u001b[0;34m(a, b, dense_output)\u001b[0m\n\u001b[1;32m    171\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mret\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    172\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 173\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    174\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    175\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mMemoryError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# The default rbf kernel materialises a dense n_samples x n_samples affinity matrix;\n",
    "# with the ~100k rows used here that needs tens of GB and raised MemoryError.\n",
    "# The knn kernel builds a sparse k-nearest-neighbour graph instead.\n",
    "lp_model = LabelSpreading(kernel='knn', n_neighbors=7, max_iter=128, tol=1e-3, n_jobs=-1)\n",
    "lp_model.fit(ss_X, ss_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Labels the model assigned to the held-out rows. test_data.index holds index labels but\n",
    "# is used positionally here -- NOTE(review): assumes tagged_data has a 0..n-1 index; confirm.\n",
    "y_pred = lp_model.transduction_[test_data.index]\n",
    "true_tags = test_data['tag'].values\n",
    "# True wherever the propagated label disagrees with the stored tag.\n",
    "res = np.asarray(y_pred != true_tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fraction of held-out rows whose propagated label differs from the stored tag.\n",
    "np.mean(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index labels of the rows whose tag was hidden from the model (ss < 0).\n",
    "unlabeled_indices = test_data.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Entropy of each row's predicted label distribution: higher means less certain.\n",
    "pred_entropies = stats.entropy(lp_model.label_distributions_.T)\n",
    "# Sort descending so the most uncertain rows come first. The previous `[::1]` slice was a\n",
    "# no-op that left the order ascending, i.e. it selected the most confident rows instead.\n",
    "uncertainty_index = np.argsort(pred_entropies)[::-1]\n",
    "# Keep only positions that appear in unlabeled_indices, then take the top 2000.\n",
    "uncertainty_index = uncertainty_index[np.in1d(uncertainty_index, unlabeled_indices)][:2000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the selected row positions.\n",
    "uncertainty_index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Spread"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from model.db import DB_ENGINE, rawcontents\n",
    "from sqlalchemy import update"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rid</th>\n",
       "      <th>tag</th>\n",
       "      <th>assure</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1.007420e+05</td>\n",
       "      <td>100742.000000</td>\n",
       "      <td>100742.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>5.841642e+05</td>\n",
       "      <td>0.981507</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>4.699411e+05</td>\n",
       "      <td>0.134726</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1.784752e+05</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>4.258105e+05</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1.008566e+06</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.587615e+06</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                rid            tag    assure\n",
       "count  1.007420e+05  100742.000000  100742.0\n",
       "mean   5.841642e+05       0.981507       1.0\n",
       "std    4.699411e+05       0.134726       0.0\n",
       "min    1.000000e+00       0.000000       1.0\n",
       "25%    1.784752e+05       1.000000       1.0\n",
       "50%    4.258105e+05       1.000000       1.0\n",
       "75%    1.008566e+06       1.000000       1.0\n",
       "max    1.587615e+06       1.000000       1.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pull every row from the database, then split on the `assure` column:\n",
    "# below 0.5 goes to unlabeled_data, above 0.5 goes to labled_data.\n",
    "raw_contents = pd.read_sql('SELECT rid, content, tag, assure FROM rawcontents', DB_ENGINE)\n",
    "unlabeled_data = raw_contents.loc[lambda frame: frame['assure'] < 0.5].copy()\n",
    "labled_data = raw_contents.loc[lambda frame: frame['assure'] > 0.5].copy()\n",
    "labled_data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rid</th>\n",
       "      <th>content</th>\n",
       "      <th>tag</th>\n",
       "      <th>assure</th>\n",
       "      <th>ss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>破5000是大概率事件</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>出天涯钻，5毛一个</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>探讨挖矿相关的技术，探讨区块链数字货币投资。展示行业最新动态，交流各自心得。共同提高对区块链...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>有想法吗</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>看名来</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>挖掘商业新形态，掌握赚钱新模式\\n \\n 重塑商业新格局，构建商业新思维\\n \\n 抢占区块...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>币圈经过多年发展已经被头部交易所垄断，新币若想上去，需要花很多钱。\\n \\n 既然区块链是传...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>不太现实，因为好多币种都是圈钱的，如果平台都给免费上去没有任何的检测什么的，那平台肯定做不长...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>依楼主的想法，股票也可以到淘宝开店卖？</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>@yangzhenjun 2018-07-16 10:02:08\\n \\n 依楼主的想法，股...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>币圈从去年的牛市变成今天的熊市，都说币圈一天人间一年，但半年过去了，币圈还是没有转牛的迹象。...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>牛市要来了</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>在熊市的时候，其实是自己做项目做有机会的时候，因为这个时候就是一些已经老牌的项目也会受到波及...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>当全世界的目光被世界杯吸引之时，“币圈”里一则重磅消息正在击打着很多人的心理防线。6月28日...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>快蚁全球首款智能炒币机器人\\n \\n\\n AntBeast科技公司(香港)成立于2016年...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>赶快解放双手！快蚁区块链智能交易\\n \\n 人们常说，金融交易中无论是股票、外汇还是加密货币...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>快蚁区块链智能交易系统到底有多神奇，让无数炒币人疯狂追捧\\n \\n\\n 有很多人认为在数字货...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>包头火电矿场招矿机托管\\n \\n\\n 量大有优惠\\n \\n\\n 不收矿机上机费及管理费\\n ...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>这个项目不像空气币或者传销币那样子，本人亲自玩儿了两个月的项目，大概这一周他们还会推新的项目...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>zm328507854</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>区块链安全性的工作方式如下：平台上的每个交互都被记录下来，并且事务中的每条记录都与前一条记录...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>人民日报为促进区块链产业健康发展发声</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>共识机制是维护系统的运作顺序和公平性的机制，决定这区块链这个系统里的区块被准确添加到链中，确...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td></td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>区块链是21世纪最前沿的现象级概念，（加 jingshui902，一起加群来交流吧）概述区块...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>币圈众筹是由ICO发展而来的，2017年9月4日起，国内禁止ICO行为，在各个网站和主流渠道...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102760</th>\n",
       "      <td>720906</td>\n",
       "      <td>清爽的姑娘</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102761</th>\n",
       "      <td>653682</td>\n",
       "      <td>降啊 使劲降\\n \\n 原来抽35的万宝路双爆\\n \\n 现在抽爱喜妈的\\n \\n 搬砖去了</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102762</th>\n",
       "      <td>1545747</td>\n",
       "      <td>叙利亚政府军不断收复失地，即将对叛军控制的伊德利卜省发起攻势。美国总统特朗普日前发推警告叙利...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102763</th>\n",
       "      <td>306294</td>\n",
       "      <td>楼主文章不错！值得投票点赞！</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102764</th>\n",
       "      <td>107756</td>\n",
       "      <td>学习，支持。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102765</th>\n",
       "      <td>336596</td>\n",
       "      <td>楼上说的对。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102766</th>\n",
       "      <td>511899</td>\n",
       "      <td>都是一堆问题哪</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102767</th>\n",
       "      <td>90851</td>\n",
       "      <td>不错。大气的老板。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102768</th>\n",
       "      <td>518328</td>\n",
       "      <td>中新经纬客户端10月27日电 房地产市场热度消散，近期公布的相关数据已有所体现。易居研究院2...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102769</th>\n",
       "      <td>190150</td>\n",
       "      <td>希望天涯越来越好，盼回</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102770</th>\n",
       "      <td>154462</td>\n",
       "      <td>爱天涯，爱洛洛，没抢到红包也支持楼主！</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102771</th>\n",
       "      <td>1565787</td>\n",
       "      <td>美国逐步废除自己建立的国际秩序漏出了他本就狰狞的面目。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102772</th>\n",
       "      <td>579754</td>\n",
       "      <td>没钱</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102773</th>\n",
       "      <td>758785</td>\n",
       "      <td>见到帖就想回，不回不痛快！！！</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102774</th>\n",
       "      <td>23731</td>\n",
       "      <td>口令  一起去看海\\n   \\n\\n    抢红包</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102775</th>\n",
       "      <td>835774</td>\n",
       "      <td>都是用娘老子的钱贷款买的豪车，没什么好羡慕的！！！</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102776</th>\n",
       "      <td>1203600</td>\n",
       "      <td>阿庆剑指和珅，此时子时已到，这一天又过去了\\n \\n 和珅生命剩余25天</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102777</th>\n",
       "      <td>546776</td>\n",
       "      <td>硬碰硬的正面决斗3\\n \\n\\n 一直紧盯着“两洋联盟”接应舰队进攻阵容的“曼莱督”一级军令...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102778</th>\n",
       "      <td>279535</td>\n",
       "      <td>恭喜发财，大吉大利！\\n   \\n\\n    抢红包</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102779</th>\n",
       "      <td>254303</td>\n",
       "      <td>楼主的帖子我认证看了一遍，楼主文采是如此的出众</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102780</th>\n",
       "      <td>556800</td>\n",
       "      <td>说得好且对！福兄好！</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102781</th>\n",
       "      <td>1428148</td>\n",
       "      <td>小偷哪里都有，不过为了一块钱的东西打成这样真的罕见。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102782</th>\n",
       "      <td>902297</td>\n",
       "      <td>“大姐，您怎么了？”百草堂内一名女导购员，立刻跑过来着急问。\\n \\n\\n “快叫救护车。”...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102783</th>\n",
       "      <td>5826</td>\n",
       "      <td>来吧~~15连击点，我会回点~~每天都可以相互点哟！</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102784</th>\n",
       "      <td>423435</td>\n",
       "      <td>说一说看到的新闻吧，对于虚拟币一定要理性看待</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102785</th>\n",
       "      <td>929590</td>\n",
       "      <td>小时候一直期盼着长大，可以做喜欢的事，长大后才知道童年是最幸福的时光。非常喜欢张嘉佳这个作者...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102786</th>\n",
       "      <td>740720</td>\n",
       "      <td>昆山标杆！</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102787</th>\n",
       "      <td>1217125</td>\n",
       "      <td>这尺度真是大</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102788</th>\n",
       "      <td>400845</td>\n",
       "      <td>@潇洒大佬 2018-09-13 09:09:25\\n \\n 支持楼主。填楼\\n \\n --...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102789</th>\n",
       "      <td>228576</td>\n",
       "      <td>已经点赞楼主，请楼主记得友情回赞</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>102790 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            rid                                            content  tag  \\\n",
       "0             1                    比特币现价41000元左右。至今年底最少跌去一半！立此帖为证。  1.0   \n",
       "1             2                                        破5000是大概率事件  1.0   \n",
       "2             3              估计到时候都是非去中心化的币才是追捧的对象。没有信用背书的币还是不太靠谱。  0.0   \n",
       "3             4                                          出天涯钻，5毛一个  1.0   \n",
       "4             5                                                     1.0   \n",
       "5             6  探讨挖矿相关的技术，探讨区块链数字货币投资。展示行业最新动态，交流各自心得。共同提高对区块链...  0.0   \n",
       "6             7                                               有想法吗  1.0   \n",
       "7             8                                                看名来  1.0   \n",
       "8             9  挖掘商业新形态，掌握赚钱新模式\\n \\n 重塑商业新格局，构建商业新思维\\n \\n 抢占区块...  1.0   \n",
       "9            10  币圈经过多年发展已经被头部交易所垄断，新币若想上去，需要花很多钱。\\n \\n 既然区块链是传...  0.0   \n",
       "10           11  不太现实，因为好多币种都是圈钱的，如果平台都给免费上去没有任何的检测什么的，那平台肯定做不长...  0.0   \n",
       "11           12                                依楼主的想法，股票也可以到淘宝开店卖？  1.0   \n",
       "12           13  @yangzhenjun 2018-07-16 10:02:08\\n \\n 依楼主的想法，股...  0.0   \n",
       "13           14  币圈从去年的牛市变成今天的熊市，都说币圈一天人间一年，但半年过去了，币圈还是没有转牛的迹象。...  0.0   \n",
       "14           15                                              牛市要来了  1.0   \n",
       "15           16  在熊市的时候，其实是自己做项目做有机会的时候，因为这个时候就是一些已经老牌的项目也会受到波及...  0.0   \n",
       "16           17  当全世界的目光被世界杯吸引之时，“币圈”里一则重磅消息正在击打着很多人的心理防线。6月28日...  0.0   \n",
       "17           18  快蚁全球首款智能炒币机器人\\n \\n\\n AntBeast科技公司(香港)成立于2016年...  0.0   \n",
       "18           19  赶快解放双手！快蚁区块链智能交易\\n \\n 人们常说，金融交易中无论是股票、外汇还是加密货币...  1.0   \n",
       "19           20  快蚁区块链智能交易系统到底有多神奇，让无数炒币人疯狂追捧\\n \\n\\n 有很多人认为在数字货...  1.0   \n",
       "20           21  包头火电矿场招矿机托管\\n \\n\\n 量大有优惠\\n \\n\\n 不收矿机上机费及管理费\\n ...  1.0   \n",
       "21           22  这个项目不像空气币或者传销币那样子，本人亲自玩儿了两个月的项目，大概这一周他们还会推新的项目...  1.0   \n",
       "22           23                                        zm328507854  1.0   \n",
       "23           24  区块链安全性的工作方式如下：平台上的每个交互都被记录下来，并且事务中的每条记录都与前一条记录...  0.0   \n",
       "24           25                                                     1.0   \n",
       "25           26                                 人民日报为促进区块链产业健康发展发声  0.0   \n",
       "26           27  共识机制是维护系统的运作顺序和公平性的机制，决定这区块链这个系统里的区块被准确添加到链中，确...  0.0   \n",
       "27           28                                                     1.0   \n",
       "28           29  区块链是21世纪最前沿的现象级概念，（加 jingshui902，一起加群来交流吧）概述区块...  0.0   \n",
       "29           30  币圈众筹是由ICO发展而来的，2017年9月4日起，国内禁止ICO行为，在各个网站和主流渠道...  0.0   \n",
       "...         ...                                                ...  ...   \n",
       "102760   720906                                              清爽的姑娘  NaN   \n",
       "102761   653682     降啊 使劲降\\n \\n 原来抽35的万宝路双爆\\n \\n 现在抽爱喜妈的\\n \\n 搬砖去了  NaN   \n",
       "102762  1545747  叙利亚政府军不断收复失地，即将对叛军控制的伊德利卜省发起攻势。美国总统特朗普日前发推警告叙利...  NaN   \n",
       "102763   306294                                     楼主文章不错！值得投票点赞！  NaN   \n",
       "102764   107756                                             学习，支持。  NaN   \n",
       "102765   336596                                             楼上说的对。  NaN   \n",
       "102766   511899                                            都是一堆问题哪  NaN   \n",
       "102767    90851                                          不错。大气的老板。  NaN   \n",
       "102768   518328  中新经纬客户端10月27日电 房地产市场热度消散，近期公布的相关数据已有所体现。易居研究院2...  NaN   \n",
       "102769   190150                                        希望天涯越来越好，盼回  NaN   \n",
       "102770   154462                                爱天涯，爱洛洛，没抢到红包也支持楼主！  NaN   \n",
       "102771  1565787                        美国逐步废除自己建立的国际秩序漏出了他本就狰狞的面目。  NaN   \n",
       "102772   579754                                                 没钱  NaN   \n",
       "102773   758785                                    见到帖就想回，不回不痛快！！！  NaN   \n",
       "102774    23731                          口令  一起去看海\\n   \\n\\n    抢红包  NaN   \n",
       "102775   835774                          都是用娘老子的钱贷款买的豪车，没什么好羡慕的！！！  NaN   \n",
       "102776  1203600               阿庆剑指和珅，此时子时已到，这一天又过去了\\n \\n 和珅生命剩余25天  NaN   \n",
       "102777   546776  硬碰硬的正面决斗3\\n \\n\\n 一直紧盯着“两洋联盟”接应舰队进攻阵容的“曼莱督”一级军令...  NaN   \n",
       "102778   279535                         恭喜发财，大吉大利！\\n   \\n\\n    抢红包  NaN   \n",
       "102779   254303                            楼主的帖子我认证看了一遍，楼主文采是如此的出众  NaN   \n",
       "102780   556800                                         说得好且对！福兄好！  NaN   \n",
       "102781  1428148                         小偷哪里都有，不过为了一块钱的东西打成这样真的罕见。  NaN   \n",
       "102782   902297  “大姐，您怎么了？”百草堂内一名女导购员，立刻跑过来着急问。\\n \\n\\n “快叫救护车。”...  NaN   \n",
       "102783     5826                         来吧~~15连击点，我会回点~~每天都可以相互点哟！  NaN   \n",
       "102784   423435                             说一说看到的新闻吧，对于虚拟币一定要理性看待  NaN   \n",
       "102785   929590  小时候一直期盼着长大，可以做喜欢的事，长大后才知道童年是最幸福的时光。非常喜欢张嘉佳这个作者...  NaN   \n",
       "102786   740720                                              昆山标杆！  NaN   \n",
       "102787  1217125                                             这尺度真是大  NaN   \n",
       "102788   400845  @潇洒大佬 2018-09-13 09:09:25\\n \\n 支持楼主。填楼\\n \\n --...  NaN   \n",
       "102789   228576                                   已经点赞楼主，请楼主记得友情回赞  NaN   \n",
       "\n",
       "        assure   ss  \n",
       "0            1  1.0  \n",
       "1            1  1.0  \n",
       "2            1  0.0  \n",
       "3            1  1.0  \n",
       "4            1  1.0  \n",
       "5            1  0.0  \n",
       "6            1  1.0  \n",
       "7            1  1.0  \n",
       "8            1  1.0  \n",
       "9            1  0.0  \n",
       "10           1  0.0  \n",
       "11           1  1.0  \n",
       "12           1  0.0  \n",
       "13           1  0.0  \n",
       "14           1  1.0  \n",
       "15           1  0.0  \n",
       "16           1  0.0  \n",
       "17           1  0.0  \n",
       "18           1  1.0  \n",
       "19           1  1.0  \n",
       "20           1  1.0  \n",
       "21           1  1.0  \n",
       "22           1  1.0  \n",
       "23           1  0.0  \n",
       "24           1  1.0  \n",
       "25           1  0.0  \n",
       "26           1  0.0  \n",
       "27           1  1.0  \n",
       "28           1  0.0  \n",
       "29           1  0.0  \n",
       "...        ...  ...  \n",
       "102760       0 -1.0  \n",
       "102761       0 -1.0  \n",
       "102762       0 -1.0  \n",
       "102763       0 -1.0  \n",
       "102764       0 -1.0  \n",
       "102765       0 -1.0  \n",
       "102766       0 -1.0  \n",
       "102767       0 -1.0  \n",
       "102768       0 -1.0  \n",
       "102769       0 -1.0  \n",
       "102770       0 -1.0  \n",
       "102771       0 -1.0  \n",
       "102772       0 -1.0  \n",
       "102773       0 -1.0  \n",
       "102774       0 -1.0  \n",
       "102775       0 -1.0  \n",
       "102776       0 -1.0  \n",
       "102777       0 -1.0  \n",
       "102778       0 -1.0  \n",
       "102779       0 -1.0  \n",
       "102780       0 -1.0  \n",
       "102781       0 -1.0  \n",
       "102782       0 -1.0  \n",
       "102783       0 -1.0  \n",
       "102784       0 -1.0  \n",
       "102785       0 -1.0  \n",
       "102786       0 -1.0  \n",
       "102787       0 -1.0  \n",
       "102788       0 -1.0  \n",
       "102789       0 -1.0  \n",
       "\n",
       "[102790 rows x 5 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of unlabeled rows mixed in with the labeled data for this round.\n",
    "N_UNLABELED = 2048\n",
    "\n",
    "# Random order over the unlabeled pool; only the first N_UNLABELED positions are used\n",
    "# (the slice simply takes every row when the pool is smaller than that).\n",
    "inds = np.arange(len(unlabeled_data))\n",
    "np.random.shuffle(inds)\n",
    "\n",
    "# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.\n",
    "# ignore_index=True renumbers the combined rows 0..n-1.\n",
    "current = pd.concat(\n",
    "    [labled_data, unlabeled_data.iloc[inds[:N_UNLABELED]]],\n",
    "    ignore_index=True,\n",
    ")\n",
    "\n",
    "# 'ss' is -1 wherever 'assure' is below 0.5 and the row's 'tag' otherwise.\n",
    "# .values keeps the computation positional, like the row-by-row version it replaces.\n",
    "current['ss'] = np.where(current['assure'].values < 0.5, -1, current['tag'].values)\n",
    "current"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "prepared\n"
     ]
    }
   ],
   "source": [
    "# One Doc2Vec vector per row of 'content'; 'ss' supplies the matching labels.\n",
    "# NOTE(review): infer_vector is documented to take a list of tokens, while 'content' appears to hold\n",
    "# raw strings -- confirm this matches how the model was trained.\n",
    "X_current = [model.infer_vector(text) for text in current['content'].values]\n",
    "y_current = current['ss'].values\n",
    "\n",
    "print(\"prepared\")\n",
    "\n",
    "# Build the estimator, then fit it on the vectors and labels prepared above.\n",
    "lp_model = LabelSpreading(\n",
    "    max_iter=128,\n",
    "    tol=1e-3,\n",
    "    n_jobs=-1,\n",
    ")\n",
    "lp_model_fitted = lp_model.fit(X_current, y_current)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Entropy of the fitted model's label distribution for every row of `current`\n",
    "# (the distribution matrix is transposed before being handed to entropy()).\n",
    "label_dists = lp_model.label_distributions_\n",
    "pred_entropies = stats.distributions.entropy(label_dists.T)\n",
    "current['pred_entropies'] = pred_entropies\n",
    "\n",
    "# Highest-entropy rows first, renumbered from 0.\n",
    "by_entropy = current.sort_values('pred_entropies', ascending=False)\n",
    "sortted = by_entropy.set_index(np.arange(len(current)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The first 41 rows of `sortted` (positions 0 through 40).\n",
    "shuirows = sortted.iloc[:41]\n",
    "\n",
    "shuirows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Text of the selected rows, in the order they appear in `shuirows`.\n",
    "shui = shuirows['content'].values\n",
    "\n",
    "# Show each distinct text once. pd.unique keeps first-appearance order, so the listing is\n",
    "# stable between runs and follows the row order (a set would print in arbitrary order).\n",
    "pprint(list(pd.unique(shui)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Starts out empty.\n",
    "# NOTE(review): presumably filled in by hand with the texts that do not belong in `shui` -- confirm.\n",
    "bushui = list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print both collections as plain Python lists, each prefixed with its variable name.\n",
    "for prefix, items in (('shui =', shui), ('bushui =', bushui)):\n",
    "    print(prefix, list(items))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
